# raw counts downloaded from GEO
# https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE205555
counts_csv <- "tables/counts.csv"
counts_tpm_csv <- "tables/counts_tpm.csv"
if (file.exists(counts_csv)){
counts <- read_csv(counts_csv) %>% column_to_rownames("ensembl_gene_id")
count_tpm <- read_csv(counts_tpm)
}else{
files <- fs::dir_ls(path = "../data/input_geo", glob = "*exonCounts.txt")
counts <- readr::read_tsv(files, id = "path", col_names = c("ensembl_gene_id", "raw_counts"))
df_split <- str_split_fixed(counts$path, "_", 13) %>% as.data.frame()
counts$sample <- df_split$V2
counts <- counts %>%
mutate(sample = str_replace(sample, "geo/", "")) %>%
dplyr::relocate(sample) %>% dplyr::select(-path) %>%
pivot_wider(names_from = "sample", values_from = "raw_counts")
gene_length <- read_tsv("tables/GC_lengths.tsv") %>% arrange(ensembl_gene_id)
counts <- counts %>% arrange(ensembl_gene_id)
gene_ids <- intersect(counts$ensembl_gene_id, gene_length$ensembl_gene_id)
v_len <- gene_length %>% dplyr::filter(ensembl_gene_id %in% gene_ids)
counts <- counts %>% dplyr::filter(ensembl_gene_id %in% gene_ids)
write_csv(counts, counts_csv)
counts <- counts %>% column_to_rownames("ensembl_gene_id")
x <- counts / v_len$Length
counts_tpm <- t(t(x) * 1e6 / colSums(x)) %>% as.data.frame() %>% round(2) %>%
rownames_to_column("ensembl_gene_id") %>% write_csv(counts_tpm_csv)
}
## Error in standardise_path(file): object 'counts_tpm' not found
# Load the data and metadata
metadata <- read_csv("tables/metadata.csv") %>% column_to_rownames(var = "sample_id")
protein_coding_genes <- read_csv("tables/ensembl_w_description.protein_coding.csv")
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
Here we subset protein coding genes.
counts <- counts(dds, normalized = TRUE)
design <- as.data.frame(colData(dds))
degCheckFactors(counts)
res <- results(dds)
degQC(counts, design[["treatment"]], pvalue = res[["pvalue"]])
degQC(counts, design[["fibroblast_line"]], pvalue = res[["pvalue"]])
mdata <- colData(dds) %>% as.data.frame() %>%
dplyr::select(treatment, fibroblast_line)
resCov <- degCovariates(log2(counts(dds)+0.5), mdata)
cor <- degCorCov(mdata)
# Use the DESeq2 function
plotPCA(rld, intgroup = c("treatment")) + geom_label_repel(aes(label = name)) + theme_bw()
# Use the DESeq2 function
plotPCA(rld, intgroup = c("fibroblast_line")) + geom_label_repel(aes(label = name))
# Correlation matrix
rld_cor <- cor(rld_mat)
# Create annotation file for samples
annotation <- metadata[, c("treatment", "fibroblast_line")]
# Change colors
heat.colors <- brewer.pal(6, "Blues")
# Plot heatmap
pheatmap(rld_cor,
annotation = annotation,
border = NA,
fontsize = 20)
rv <- rowVars(rld_mat)
rv <- order(rv, decreasing = TRUE) %>% head(1000)
rld_mat_1000 <- rld_mat[rv,]
annotation <- metadata[, c("treatment", "fibroblast_line")]
# Change colors
heat.colors <- brewer.pal(6, "Blues")
rld_cor <- cor(rld_mat_1000)
# Plot heatmap
pheatmap(rld_cor,
annotation = annotation,
border = NA,
fontsize = 20)
rld.sub <- rld[ , rld$treatment %in% c("adapalene", "DMSO") ]
plotPCA(rld.sub, intgroup = c("treatment")) + geom_label_repel(aes(label = name)) + theme_bw()
plotPCA(rld.sub, intgroup = c("fibroblast_line")) + geom_label_repel(aes(label = name)) + theme_bw()
contrast <- c("treatment", "adapalene", "DMSO")
resTreatment <- results(dds, contrast = contrast, alpha = 0.05)
length(which(resTreatment$padj < 0.05))
## [1] 4232
# Add annotations
resTreatment_tb <- resTreatment %>%
data.frame() %>%
rownames_to_column(var = "gene") %>%
as_tibble() %>%
left_join(gene_symbol, by = c("gene" = "gene_id"))
resTreatment_tb_significant <- dplyr::filter(resTreatment_tb, padj < 0.05) %>%
dplyr::filter(abs(log2FoldChange) > 1) %>%
comb_de_result_table()
samples_control <- metadata %>% rownames_to_column("ensembl_gene_id") %>%
dplyr::filter(treatment == "DMSO") %>% pull("ensembl_gene_id")
tpm_control <- get_counts_for_samples(counts_tpm, samples_control, "DMSO_mean_tpm")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': error in evaluating the argument 'x' in selecting a method for function 'rowMeans': object 'counts_tpm' not found
samples_effect <- metadata %>% dplyr::filter(treatment == "adapalene") %>% row.names()
tpm_effect <- get_counts_for_samples(counts_tpm, samples_effect, "adapalene_tpm")
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': error in evaluating the argument 'x' in selecting a method for function 'rowMeans': object 'counts_tpm' not found
tpm_counts <- tpm_effect %>%
left_join(tpm_control,
by = c("ensembl_gene_id" = "ensembl_gene_id"))
## Error in left_join(., tpm_control, by = c(ensembl_gene_id = "ensembl_gene_id")): object 'tpm_effect' not found
resTreatment_tb_significant <- resTreatment_tb_significant %>%
left_join(tpm_counts, by = c("gene" = "ensembl_gene_id")) %>%
arrange(log2FoldChange)
## Error in is.data.frame(y): object 'tpm_counts' not found
write_xlsx(list(T2.DE_adapalene = resTreatment_tb_significant),
"tables/T2.DE_adapalene.xlsx")
# Separate into up and down-regulated gene sets
sigTreatment_up <- rownames(resTreatment)[which(resTreatment$padj < 0.01 & resTreatment$log2FoldChange > 0)]
sigTreatment_down <- rownames(resTreatment)[which(resTreatment$padj < 0.01 & resTreatment$log2FoldChange < 0)]
Gene example
d <- plotCounts(dds,
gene = "ENSG00000198846",
intgroup = "treatment",
returnData = TRUE)
ggplot(d, aes(x = treatment, y = count, color = treatment)) +
geom_point(position = position_jitter(w = 0.1, h = 0)) +
geom_text_repel(aes(label = rownames(d))) +
theme_bw(base_size = 10) +
ggtitle("TOX") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_y_log10()
# Create a matrix of normalized expression
sig_up <- resTreatment_tb_significant %>% arrange(-log2FoldChange) %>% head(50) %>% pull(gene)
sig_down <- resTreatment_tb_significant %>% arrange(log2FoldChange) %>% head(50) %>% pull(gene)
sig <- c(sig_up, sig_down)
row_annotation <- gene_symbol %>%
as_tibble() %>%
dplyr::filter(gene_id %in% sig)
plotmat <- counts_tpm %>% column_to_rownames("ensembl_gene_id") %>%
dplyr::select(any_of(c(samples_control, samples_effect)))
## Error in is.data.frame(.data): object 'counts_tpm' not found
plotmat <- plotmat[c(sig_up, sig_down),] %>% as.data.frame() %>%
rownames_to_column(var = "ensembl_gene_id") %>%
left_join(gene_symbol, by = c("ensembl_gene_id" = "gene_id")) %>%
drop_na(symbol)
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.data.frame': object 'plotmat' not found
plotmat$ensembl_gene_id <- NULL
## Error in plotmat$ensembl_gene_id <- NULL: object 'plotmat' not found
plotmat <- plotmat %>% column_to_rownames(var = "symbol") %>% as.matrix()
## Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'as.matrix': object 'plotmat' not found
# Color palette
heat.colors <- brewer.pal(6, "YlOrRd")
# Plot heatmap
# color = heat.colors,
pheatmap(plotmat, scale = "row",
show_rownames = TRUE,
border = FALSE,
annotation = metadata[, c("treatment"), drop = FALSE],
main = "Top 50 Up- and Down- regulated genes in treatment: adapalene vs DMSO",
fontsize = 20)
## Error in rownames(mat): object 'plotmat' not found
sessionInfo()
## R version 4.2.2 (2022-10-31)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Fedora Linux 37 (Workstation Edition)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/libflexiblas.so.3.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] writexl_1.4.1 ggplotify_0.1.0
## [3] knitr_1.41 ggrepel_0.9.2
## [5] tximport_1.26.1 DEGreport_1.34.0
## [7] pheatmap_1.0.12 DESeq2_1.38.2
## [9] SummarizedExperiment_1.28.0 MatrixGenerics_1.10.0
## [11] matrixStats_0.63.0 RColorBrewer_1.1-3
## [13] ensembldb_2.22.0 AnnotationFilter_1.22.0
## [15] GenomicFeatures_1.50.3 AnnotationDbi_1.60.0
## [17] Biobase_2.58.0 GenomicRanges_1.50.2
## [19] GenomeInfoDb_1.34.9 IRanges_2.32.0
## [21] S4Vectors_0.36.1 AnnotationHub_3.6.0
## [23] BiocFileCache_2.6.0 dbplyr_2.2.1
## [25] BiocGenerics_0.44.0 forcats_0.5.2
## [27] stringr_1.5.0 dplyr_1.0.10
## [29] purrr_1.0.0 readr_2.1.3
## [31] tidyr_1.2.1 tibble_3.1.8
## [33] ggplot2_3.4.0 tidyverse_1.3.2
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 tidyselect_1.2.0
## [3] RSQLite_2.2.20 grid_4.2.2
## [5] BiocParallel_1.32.5 munsell_0.5.0
## [7] codetools_0.2-18 withr_2.5.0
## [9] colorspace_2.0-3 filelock_1.0.2
## [11] highr_0.10 rstudioapi_0.14
## [13] labeling_0.4.2 GenomeInfoDbData_1.2.9
## [15] mnormt_2.1.1 bit64_4.0.5
## [17] farver_2.1.1 vctrs_0.5.1
## [19] generics_0.1.3 xfun_0.36
## [21] timechange_0.1.1 R6_2.5.1
## [23] doParallel_1.0.17 clue_0.3-63
## [25] locfit_1.5-9.7 bitops_1.0-7
## [27] cachem_1.0.6 reshape_0.8.9
## [29] gridGraphics_0.5-1 DelayedArray_0.24.0
## [31] assertthat_0.2.1 promises_1.2.0.1
## [33] BiocIO_1.8.0 scales_1.2.1
## [35] vroom_1.6.0 googlesheets4_1.0.1
## [37] gtable_0.3.1 rlang_1.0.6
## [39] MatrixModels_0.5-2 splines_4.2.2
## [41] GlobalOptions_0.1.2 rtracklayer_1.58.0
## [43] lazyeval_0.2.2 gargle_1.2.0
## [45] broom_1.0.2 BiocManager_1.30.19
## [47] yaml_2.3.6 modelr_0.1.10
## [49] backports_1.4.1 httpuv_1.6.7
## [51] tools_4.2.2 psych_2.2.9
## [53] logging_0.10-108 ellipsis_0.3.2
## [55] jquerylib_0.1.4 ggdendro_0.1.23
## [57] Rcpp_1.0.9 plyr_1.8.8
## [59] progress_1.2.2 zlibbioc_1.44.0
## [61] RCurl_1.98-1.8 prettyunits_1.1.1
## [63] GetoptLong_1.0.5 cowplot_1.1.1
## [65] haven_2.5.1 cluster_2.1.4
## [67] fs_1.5.2 magrittr_2.0.3
## [69] SparseM_1.81 circlize_0.4.15
## [71] reprex_2.0.2 googledrive_2.0.0
## [73] ProtGenerics_1.30.0 hms_1.1.2
## [75] mime_0.12 evaluate_0.19
## [77] xtable_1.8-4 XML_3.99-0.13
## [79] readxl_1.4.1 shape_1.4.6
## [81] compiler_4.2.2 biomaRt_2.54.0
## [83] crayon_1.5.2 htmltools_0.5.4
## [85] later_1.3.0 tzdb_0.3.0
## [87] geneplotter_1.76.0 lubridate_1.9.0
## [89] DBI_1.1.3 ComplexHeatmap_2.14.0
## [91] MASS_7.3-58.1 rappdirs_0.3.3
## [93] Matrix_1.6-1 cli_3.5.0
## [95] parallel_4.2.2 pkgconfig_2.0.3
## [97] GenomicAlignments_1.34.0 xml2_1.3.3
## [99] foreach_1.5.2 annotate_1.76.0
## [101] bslib_0.4.2 XVector_0.38.0
## [103] rvest_1.0.3 yulab.utils_0.0.6
## [105] digest_0.6.31 ConsensusClusterPlus_1.62.0
## [107] Biostrings_2.66.0 rmarkdown_2.19
## [109] cellranger_1.1.0 edgeR_3.40.1
## [111] restfulr_0.0.15 curl_4.3.2
## [113] shiny_1.7.4 Rsamtools_2.14.0
## [115] quantreg_5.97 rjson_0.2.21
## [117] lifecycle_1.0.3 nlme_3.1-160
## [119] jsonlite_1.8.4 limma_3.54.0
## [121] fansi_1.0.3 pillar_1.8.1
## [123] lattice_0.20-45 survival_3.4-0
## [125] KEGGREST_1.38.0 fastmap_1.1.0
## [127] httr_1.4.4 interactiveDisplayBase_1.36.0
## [129] glue_1.6.2 png_0.1-8
## [131] iterators_1.0.14 BiocVersion_3.16.0
## [133] bit_4.0.5 stringi_1.7.8
## [135] sass_0.4.4 blob_1.2.3
## [137] memoise_2.0.1